{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 135,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "from sklearn.model_selection import cross_val_score\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 136,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the CSV used for modelling into a DataFrame (path is relative to the working directory).\n",
    "labelled_csv = \"labeleddataformodel.csv\"\n",
    "data = pd.read_csv(labelled_csv)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 137,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the two columns used below: the title text and its label.\n",
    "kept_columns = ['title', 'isupliftinganecdote']\n",
    "data = data[kept_columns]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 138,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.767142585189\n",
      "[ 0.78640777  0.72815534  0.75        0.81862745  0.74509804  0.78431373\n",
      "  0.76470588  0.7745098   0.75490196  0.76470588]\n"
     ]
    }
   ],
   "source": [
    "# Bag-of-words features from the titles, capped at 3603 vocabulary entries.\n",
    "# (Earlier variant, kept for reference: CountVectorizer(stop_words=\"english\", max_features=i).)\n",
    "countvec = CountVectorizer(max_features=3603)\n",
    "labels = data['isupliftinganecdote']\n",
    "vectors = countvec.fit_transform(data['title'])\n",
    "\n",
    "# fit() returns the estimator itself, so mnb ends up trained on the full data set.\n",
    "mnb = MultinomialNB().fit(vectors, labels)\n",
    "\n",
    "# 10-fold cross-validation: print the mean score first, then the per-fold scores.\n",
    "scores = cross_val_score(mnb, vectors, labels, cv=10)\n",
    "print(sum(scores) / len(scores))\n",
    "print(scores)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 139,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "\n",
    "def rand_bin_array(K, N, seed=None):\n",
    "    \"\"\"Return a shuffled float array of length N with exactly K ones and N - K zeros.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    K : int\n",
    "        Number of entries set to 1.\n",
    "    N : int\n",
    "        Total length of the array.\n",
    "    seed : int or None, optional\n",
    "        When given, the shuffle uses a private RandomState(seed) so the result is\n",
    "        repeatable; when None the global NumPy random state is used, as before.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    numpy.ndarray\n",
    "        1-D float array of zeros and ones in random order.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    ValueError\n",
    "        If K is outside 0..N. A negative K would otherwise be read as a\n",
    "        from-the-end slice and silently give the wrong number of ones.\n",
    "    \"\"\"\n",
    "    if not 0 <= K <= N:\n",
    "        raise ValueError('K must satisfy 0 <= K <= N, got K={!r}, N={!r}'.format(K, N))\n",
    "    arr = np.zeros(N)\n",
    "    arr[:K] = 1\n",
    "    if seed is None:\n",
    "        np.random.shuffle(arr)\n",
    "    else:\n",
    "        np.random.RandomState(seed).shuffle(arr)\n",
    "    return arr\n",
    "\n",
    "\n",
    "# Fixed seed so the random baseline is identical on every Restart & Run All.\n",
    "RANDOM_LABEL_SEED = 0\n",
    "\n",
    "# One random label per row of `data`, half of them 1 (this was hard-coded as 1022 of 2044).\n",
    "n_rows = len(data)\n",
    "y_true = rand_bin_array(n_rows // 2, n_rows, seed=RANDOM_LABEL_SEED)\n",
    "\n",
    "# Plain-list copy of the labels, used when attaching them as a DataFrame column.\n",
    "listofrandomlabels = y_true.tolist()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 140,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 2044 entries, 0 to 2043\n",
      "Data columns (total 3 columns):\n",
      "title                  2044 non-null object\n",
      "isupliftinganecdote    2044 non-null float64\n",
      "randomlabels           2044 non-null float64\n",
      "dtypes: float64(2), object(1)\n",
      "memory usage: 48.0+ KB\n"
     ]
    }
   ],
   "source": [
    "# Build a separate frame with the random labels attached. assign() returns a new\n",
    "# DataFrame, so `data` itself is left without the extra column (a plain `= data`\n",
    "# only aliases the frame, and adding the column would then modify `data` too).\n",
    "randomlabeldata = data.assign(randomlabels=listofrandomlabels)\n",
    "randomlabeldata.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 141,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[ 1.  1.  0. ...,  0.  0.  0.]\n",
      "[ 0.  0.  1. ...,  0.  1.  0.]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0.51027397260273977"
      ]
     },
     "execution_count": 141,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sanity check: score the classifier that was trained on the real labels against the\n",
    "# random labels. With random labels the accuracy is expected to land near 0.5.\n",
    "from sklearn.metrics import accuracy_score\n",
    "\n",
    "# transform(), not fit_transform(): reuse the vocabulary the vectorizer learned when mnb\n",
    "# was trained, so the feature columns line up with the model and countvec is not refit.\n",
    "vectors2 = countvec.transform(randomlabeldata['title'])\n",
    "y_pred = mnb.predict(vectors2)\n",
    "# The titles are the same rows used for training, so mnb.predict(vectors) would give\n",
    "# the same predictions.\n",
    "print(y_pred)\n",
    "print(y_true)\n",
    "accuracy_score(y_true, y_pred)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 142,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.511812297735\n",
      "[ 0.49514563  0.45631068  0.53431373  0.5         0.50490196  0.48039216\n",
      "  0.50490196  0.52941176  0.54411765  0.56862745]\n"
     ]
    }
   ],
   "source": [
    "# Alternative baseline: instead of only scoring the already-trained classifier, train a\n",
    "# fresh one on the random labels and cross-validate it. (Author's note: not sure which of\n",
    "# the two approaches is the intended one; this also works.)\n",
    "# (Earlier variant, kept for reference: CountVectorizer(stop_words=\"english\", max_features=i).)\n",
    "countvec = CountVectorizer(max_features=3603)\n",
    "labels2 = randomlabeldata['randomlabels']\n",
    "vectors3 = countvec.fit_transform(randomlabeldata['title'])\n",
    "\n",
    "# fit() returns the estimator itself, so mnb2 is the model trained on the random labels.\n",
    "mnb2 = MultinomialNB().fit(vectors3, labels2)\n",
    "\n",
    "# 10-fold cross-validation: print the mean score first, then the per-fold scores.\n",
    "scores = cross_val_score(mnb2, vectors3, labels2, cv=10)\n",
    "print(sum(scores) / len(scores))\n",
    "print(scores)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
